Qingyang Liu
3/8/2018
xgboost R package

The regularized objective at iteration t reduces to a quadratic in the leaf
weights w_j (with G_j and H_j the sums of first and second derivatives of the
loss over the instances in leaf j):

\[Obj^{(t)}\approx \sum_{j=1}^{|T|} \left[G_j w_j+\frac{1}{2}\left(H_j+\lambda\right)w_j^2\right]+\gamma|T|\]
xgboost will assign a direction to missing values instead of a numerical
value. Specifically, xgboost sends the data points with missing values to the
left and to the right child in turn, then chooses the direction with the
higher gain with respect to the objective.

xgboost R Package

    xgboost(data = NULL, label = NULL, missing = NA,
            weight = NULL, params = list(), nrounds, ...)
Key arguments and parameters:
- data
- booster [default=gbtree]
- objective [default=reg:linear]
- nrounds
- eta [default=0.3, alias: learning_rate]
- gamma [default=0, alias: min_split_loss]

xgboost R Package

    xgboost(data = NULL, label = NULL, missing = NA,
            weight = NULL, params = list(), nrounds, ...)
- max_depth [default=6]
- subsample [default=1]
- colsample_bytree [default=1]
- colsample_bylevel [default=1]
- lambda [default=1, alias: reg_lambda]
- alpha [default=0, alias: reg_alpha]

df <- iris
str(df)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Load xgboost BEFORE the first call to xgb.DMatrix()/xgboost();
# the original listing loaded it only after constructing dtrain.
library(xgboost)

# Design matrix: all columns except Species (column 5).
X <- as.matrix(df[, -5])
# xgboost multi-class objectives require 0-based integer class labels,
# hence the "- 1" after coercing the Species factor.
Y <- as.numeric(df[, 5]) - 1
dtrain <- xgb.DMatrix(data = X, label = Y)

# Multi-class classifier returning hard class labels ("multi:softmax").
# Use the full parameter name max_depth rather than relying on R's
# partial argument matching via max.depth.
xgbtree1 <- xgboost(data = dtrain, objective = "multi:softmax",
                    nrounds = 20, max_depth = 3, num_class = 3, eta = 0.2)
## [1] train-merror:0.026667
## [2] train-merror:0.026667
## [3] train-merror:0.026667
## [4] train-merror:0.026667
## [5] train-merror:0.026667
## [6] train-merror:0.026667
## [7] train-merror:0.026667
## [8] train-merror:0.026667
## [9] train-merror:0.026667
## [10] train-merror:0.026667
## [11] train-merror:0.020000
## [12] train-merror:0.020000
## [13] train-merror:0.013333
## [14] train-merror:0.013333
## [15] train-merror:0.013333
## [16] train-merror:0.013333
## [17] train-merror:0.013333
## [18] train-merror:0.013333
## [19] train-merror:0.013333
## [20] train-merror:0.013333
# With objective "multi:softmax", predict() returns the predicted
# class label (0, 1 or 2) for each row of X.
pred1 <- predict(xgbtree1, X)
head(pred1)  # class labels
## [1] 0 0 0 0 0 0
# Same model, but "multi:softprob" returns per-class probabilities
# instead of hard labels; verbose = 0 suppresses the training log.
xgbtree2 <- xgboost(data = dtrain, objective = "multi:softprob",
                    nrounds = 20, max_depth = 3, num_class = 3, eta = 0.2,
                    verbose = 0)

# predict() returns the probabilities as one long vector
# (row-major, num_class values per observation); reshape to an
# n x 3 matrix with one column per class.
pred2 <- predict(xgbtree2, X)
head(matrix(pred2, ncol = 3, byrow = TRUE))  # class probabilities
## [,1] [,2] [,3]
## [1,] 0.9757425 0.01341130 0.0108463
## [2,] 0.9762915 0.01285605 0.0108524
## [3,] 0.9762915 0.01285605 0.0108524
## [4,] 0.9762915 0.01285605 0.0108524
## [5,] 0.9757425 0.01341130 0.0108463
## [6,] 0.9757425 0.01341130 0.0108463
# Dump the fitted boosters as a character vector, one line per node;
# with_stats = TRUE includes the gain and cover statistics per split.
tree <- xgb.dump(xgbtree2, with_stats = TRUE)
tree[1:14]  # check the first two trees
## [1] "booster[0]"
## [2] "0:[f2<2.45] yes=1,no=2,missing=1,gain=72.2968,cover=66.6667"
## [3] "1:leaf=0.287081,cover=22.2222"
## [4] "2:leaf=-0.146699,cover=44.4444"
## [5] "booster[1]"
## [6] "0:[f2<2.45] yes=1,no=2,missing=1,gain=18.0742,cover=66.6667"
## [7] "1:leaf=-0.143541,cover=22.2222"
## [8] "2:[f3<1.75] yes=3,no=4,missing=3,gain=41.9078,cover=44.4444"
## [9] "3:[f2<4.95] yes=5,no=6,missing=5,gain=4.58985,cover=24"
## [10] "5:leaf=0.277612,cover=21.3333"
## [11] "6:leaf=-6.50233e-09,cover=2.66667"
## [12] "4:[f2<4.85] yes=7,no=8,missing=7,gain=0.635159,cover=20.4444"
## [13] "7:leaf=-5.10897e-09,cover=1.33333"
## [14] "8:leaf=-0.142541,cover=19.1111"
# Visualize the first fitted tree; feature names are taken from the
# column names of the training DMatrix.
xgb.plot.tree(feature_names = dimnames(dtrain)[[2]], model = xgbtree2,
              n_first_tree = 1, plot_width = 1000, plot_height = 500)

# Per-feature importance table (gain / cover / frequency), then its
# ggplot2 rendering; the legend is dropped for a cleaner slide.
importance <- xgb.importance(feature_names = dimnames(dtrain)[[2]],
                             model = xgbtree2)
xgb.ggplot.importance(importance) + theme(legend.position = "none")

xgb.save(xgbtree2, "xgbtree.model")  # save the final model
## [1] TRUE
xgbtree3 <- xgb.load("xgbtree.model")  # load the saved model

Cross-validation:

    xgb.cv(..., nfold, metrics = list(),
           early_stopping_rounds = NULL, ...)

- nfold: the number of cross-validation folds.
- metrics: the evaluation metrics to be watched during cross-validation.
- early_stopping_rounds: stop training when the held-out performance has
  not improved for this many consecutive rounds.

    xgboost(data = NULL, label = NULL, missing = NA,
            weight = NULL, params = list(), nrounds, ...)

- watchlist: a named list of xgb.DMatrix objects, each of them tagged
  with a name.